## This file loads and cleans the manually digitized QFR data to generate a 
##  usable time series starting in 1966. If you use these data, please cite
##  "Financial Constraints, Sectoral Heterogeneity, and the Cyclicality of 
##  Investment" by Cooper Howes. More information can be found in the 
##  file "QFR_data_readme".


####------------------ DISCLAIMER -----------------------------####

## These data are assembled entirely from public QFR reports. While 
##  I have done my best to ensure their accuracy, it is possible that 
##  mistakes remain, and neither the historical digitized data nor the 
##  cleaning program have been verified or endorsed in any capacity by 
##  the Census Bureau or any other organization.

####-----------------------------------------------------------####



##### Install packages and set WD #####

## Clear variables and data
rm(list=ls())

## Install packages. Note that R has several different "lag" functions split 
##  across various packages. This program requires the one from the dplyr package.
library(tidyverse)
library(readxl)

## Set working directory
setwd("C:/Users/Cooper/Google Drive/Journal submission/Investment cyclicality/REStat/QFR data cleaning")


## Create empty structures for real and nominal data as well as ratios
nom_data <- NULL
real_data <- NULL
raw_data <- NULL

## Specify starting and ending years. The default beginning will be fixed at 1966,
##  since that's the first quarter where the data are digitized, but it's possible
##  to extend the end year by using more recent QFR data.
startyear <- 1966
endyear <- 2021

## Create matrix of which rows of the spreadsheet to read
## Needs to change for the "eras" of the data structure.  Periods are:
## 1) Start-73Q4
## 2) 74Q1-80Q3
## 3) Small (or all) 80Q4-87Q4
## 4) Big 80Q4-87Q4
## 5) Publicly available data (post-87)
## 6) 2001Q3 SIC/NAICS conversion sheet small
## 7) 2001Q3 SIC/NAICS conversion sheet big

nppe_rows       <- c("26","52","40","52","45","41","49")
sales_rows      <- c("2" ,"5" ,"4" ,"4" ,"4" ,"4" ,"4")
liab_rows       <- c("40","70","54","70","65","55","69")
cur_liab_rows   <- c("36","65","50","65","60","51","64")
dividend_rows   <- c("9" ,"20","12","19","20","13","20")
equity_rows     <- c("44","74","57","74","71","58","75")

## Formally, this is "post-tax income". A more thorough accounting of 
##  "net income" that includes extraordinary gains and losses is available
##  for some categories of firms for some time periods, but not for all.
net_income_rows <- c("8" ,"15","11","14","16","12","16")

## Create list of all of these variable prefixes
qfr_var_names <- c("sales","nppe","liab","cur_liab","dividend","equity","net_income")

## Need to create matrices for Excel columns
##  The first matrix row is for all firms
##  The second matrix row is for big firms (small calculated as residual)
##  The third and fourth are for durables (small and large pre-81, total and large post-81)
##  The fifth and sixth are for nondurables (small and large pre-81, total and large post-81)
##  Column 1 is start and column 2 is end
cols_66 <- matrix(c("B","F","BD","BH","BJ","BN","BP","BT","BV","BZ","CB","CF"),ncol=2,byrow=TRUE)
cols_77 <- matrix(c("B","F","AX","BB","BD","BH","BJ","BN","BP","BT","BV","BZ"),ncol=2,byrow=TRUE)
cols_81 <- matrix(c("B","F","FI","FM","N","R","FU","FY","H","L","FO","FS"),ncol=2,byrow=TRUE)

## For later years it's just a list
## Order is all, all big, all dur, big dur, all nondur, big nondur
cols_88 <- c("CU","CT","CF","CE","BR","BQ")
cols_92 <- c("DG","DF","CR","CQ","CD","CC")
cols_00 <- c("DK","DJ","CZ","CY","DW","DV")
cols_09 <- c("DU","DT","DI","DH","EG","EF")

## Create groupings of tabs with same layout
range_66 <- c("66","67","68","69","70","71","72","73Q4","74","75","76")
range_77 <- c("77","78","79","80")
range_81 <- c("81","82part1","82part2","82part3","83","83part2","84","85","86","87")

## Combine all old tab names into one list
oldtabnames <- c(range_66,range_77,range_81)

## Create list of post-1988 tab names
newtabnames <- NULL
for(i in 1988:endyear){
  newtabnames <- c(newtabnames,paste0(i,c("Q1","Q2","Q3","Q4")))
}

## This function generates a column number from an Excel column name 
##  (ie AA maps to column 27)

## Input: A string of letters s
## Output: Corresponding column number
LettersToNumbers <- function(s){
  ## Uppercase
  s_upper <- toupper(s)
  ## Convert string to a vector of single letters
  s_split <- unlist(strsplit(s_upper, split=""))
  ## Convert each letter to the corresponding number
  s_number <- sapply(s_split, function(x) {which(LETTERS == x)})
  ## Derive the numeric value associated with each letter
  numbers <- 26^((length(s_number)-1):0)
  ## Calculate the column number
  column_number <- sum(s_number * numbers)
  return(column_number)
}

## Vectorize in case you want to pass more than one column name in a single call
LettersToNumbers <- Vectorize(LettersToNumbers)


## Load aggregate time series data to provide price indices
macrodata <- ts(read.csv("Aggregate_data.csv"),start=c(1966,1),end=c(2021,4),frequency=4)



##### Step 1: Convert raw digitized data into consistent time series #####


## Outer loop over which variables you want to combine
for(qfrvar in qfr_var_names){
  
  ## Replace this with the one you want
  eval(parse(text=paste0("var_rows <- ",qfrvar,"_rows")))
  
  growth <- NULL
  levels <- NULL
  
  ## Loop to combine everything from the old data
  for(i in oldtabnames){
    
    ## Calculates growth rates from 1967Q2-1976Q4
    if(i %in% range_66){
      
      ## 1974-1976 have same column range but different row for NPPE
      if(i %in% c("74","75","76")){z <- 2} else {z <- 1}
      
      ## The .name_repair portion of the code suppresses messages about naming columns when loading Excel data
      tempdata_all <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                       range=paste0(cols_66[1,1],var_rows[z],":",cols_66[1,2],var_rows[z]),
                             .name_repair = "unique_quiet"))
      
      tempdata_big <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                       range=paste0(cols_66[2,1],var_rows[z],":",cols_66[2,2],var_rows[z]),
                             .name_repair = "unique_quiet"))
      
      tempdata_small <- tempdata_all - tempdata_big
      
      tempdata_big_dur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                           range=paste0(cols_66[4,1],var_rows[z],":",cols_66[4,2],var_rows[z]),
                                 .name_repair = "unique_quiet"))
      
      tempdata_small_dur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                             range=paste0(cols_66[3,1],var_rows[z],":",cols_66[3,2],var_rows[z]),
                                   .name_repair = "unique_quiet"))
      
      tempdata_big_nondur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                              range=paste0(cols_66[6,1],var_rows[z],":",cols_66[6,2],var_rows[z]),
                                    .name_repair = "unique_quiet"))
      
      tempdata_small_nondur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                                range=paste0(cols_66[5,1],var_rows[z],":",cols_66[5,2],var_rows[z]),
                                      .name_repair = "unique_quiet"))
      
      tempdata_all_dur <- tempdata_big_dur + tempdata_small_dur
      tempdata_all_nondur <- tempdata_big_nondur + tempdata_small_nondur
      
      if(i=="73Q4"){
        tempdata_all <- tempdata_all[2:5]
        tempdata_big <- tempdata_big[2:5]
        tempdata_small <- tempdata_small[2:5]
        tempdata_all_dur <- tempdata_all_dur[2:5]
        tempdata_big_dur <- tempdata_big_dur[2:5]
        tempdata_small_dur <- tempdata_small_dur[2:5]
        tempdata_all_nondur <- tempdata_all_nondur[2:5]
        tempdata_big_nondur <- tempdata_big_nondur[2:5]
        tempdata_small_nondur <- tempdata_small_nondur[2:5]
      }
      
    }
    
    
    ## Calculates growth rates from 1977Q1-1980Q4
    if(i %in% range_77){
      
      tempdata_all <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                       range=paste0(cols_77[1,1],var_rows[2],":",cols_77[1,2],var_rows[2]),
                                       .name_repair = "unique_quiet"))
      
      tempdata_big <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                       range=paste0(cols_77[2,1],var_rows[2],":",cols_77[2,2],var_rows[2]),
                                       .name_repair = "unique_quiet"))
      
      tempdata_small <- tempdata_all - tempdata_big
      
      tempdata_big_dur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                           range=paste0(cols_77[4,1],var_rows[2],":",cols_77[4,2],var_rows[2]),
                                           .name_repair = "unique_quiet"))
      
      tempdata_small_dur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                             range=paste0(cols_77[3,1],var_rows[2],":",cols_77[3,2],var_rows[2]),
                                             .name_repair = "unique_quiet"))
      
      tempdata_big_nondur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                              range=paste0(cols_77[6,1],var_rows[2],":",cols_77[6,2],var_rows[2]),
                                              .name_repair = "unique_quiet"))
      
      tempdata_small_nondur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                                range=paste0(cols_77[5,1],var_rows[2],":",cols_77[5,2],var_rows[2]),
                                                .name_repair = "unique_quiet"))
      
      tempdata_all_dur <- tempdata_big_dur + tempdata_small_dur
      tempdata_all_nondur <- tempdata_big_nondur + tempdata_small_nondur
      
    }
    
    ## Calculates growth rates from 1981Q1-1987Q1
    ## Note that the "all" series uses a different row than the "big" series here
    if(i %in% range_81){
      
      tempdata_all <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                       range=paste0(cols_81[1,1],var_rows[3],":",cols_81[1,2],var_rows[3]),
                                       .name_repair = "unique_quiet"))
      
      tempdata_big <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                       range=paste0(cols_81[2,1],var_rows[4],":",cols_81[2,2],var_rows[4]),
                                       .name_repair = "unique_quiet"))
      
      tempdata_small <- tempdata_all - tempdata_big
      
      # Durables
      tempdata_all_dur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                           range=paste0(cols_81[3,1],var_rows[3],":",cols_81[3,2],var_rows[3]),
                                           .name_repair = "unique_quiet"))
      
      tempdata_big_dur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                           range=paste0(cols_81[4,1],var_rows[4],":",cols_81[4,2],var_rows[4]),
                                           .name_repair = "unique_quiet"))
      
      tempdata_small_dur <- tempdata_all_dur - tempdata_big_dur
      
      # Nondurables
      tempdata_all_nondur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                              range=paste0(cols_81[5,1],var_rows[3],":",cols_81[5,2],var_rows[3]),
                                              .name_repair = "unique_quiet"))
      
      tempdata_big_nondur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet=i,col_names=FALSE,
                                              range=paste0(cols_81[6,1],var_rows[4],":",cols_81[6,2],var_rows[4]),
                                              .name_repair = "unique_quiet"))
      
      tempdata_small_nondur <- tempdata_all_nondur - tempdata_big_nondur
      
      
      ## Need to do some extra adjustments here to take into account overlapping observations
      
      # Set ranges
      rg <- 1:5
      if(i=="81"){ rg <- 1:4 } # 81 covers growth rates in 1981Q1-1981Q3
      if(i=="82part1"){ rg <- 1:2 } # 82part1 covers growth rates in 1981Q4
      if(i=="82part2"){ rg <- 1:3 } # 82part2 covers growth rates in 1982Q1-1982Q2
      if(i=="82part3"){ rg <- 2:5 } # 82part3 covers growth rates in 1982Q3-1983Q1
      if(i=="83"){ rg <- 1:3 } # 83 covers growth rates in 1983Q2-1983Q3
      if(i=="83part2"){ rg <- 1:3 } # 83part2 covers growth rates in 1983Q4-1984Q1
      
      
      tempdata_all <- tempdata_all[rg]
      tempdata_big <- tempdata_big[rg]
      tempdata_small <- tempdata_small[rg]
      tempdata_all_dur <- tempdata_all_dur[rg]
      tempdata_big_dur <- tempdata_big_dur[rg]
      tempdata_small_dur <- tempdata_small_dur[rg]
      tempdata_all_nondur <- tempdata_all_nondur[rg]
      tempdata_big_nondur <- tempdata_big_nondur[rg]
      tempdata_small_nondur <- tempdata_small_nondur[rg]
      
      # The rest are back to normal, so they just use the regular tempdata
      # 84 covers growth rates in 1984Q2-1985Q1
      # 85 covers growth rates in 1985Q2-1986Q1
      # 86 covers growth rates in 1986Q2-1987Q1
      # 87 covers growth rates in 1987Q2-1988Q1
      
    }
    
    tempgrowth <- cbind(tempdata_all,tempdata_big,tempdata_small,
                        tempdata_all_dur,tempdata_big_dur,tempdata_small_dur,
                        tempdata_all_nondur,tempdata_big_nondur,tempdata_small_nondur)
    
    tempgrowth <- unname(tempgrowth)
    
    ## Make sure this specifies the dplyr version of lag!
    growthmat <- apply(tempgrowth,FUN=function(x) x/dplyr::lag(x,1),MARGIN=2)
    growthmat <- growthmat[-1,]
    
    ## Calculate levels for ratio calculations
    levelmat <- tempgrowth[-1,]
    
    growth <- rbind(growth,growthmat)
    levels <- rbind(levels,levelmat)
    
    
  }
  
  
  ## growth is the matrix giving growth rates from 66Q2-88Q1 (88 observations)
  growth <- unname(growth)
  levels <- unname(levels)
  
  
  ###### Now, import levels data from previous adjusted sheet
  
  ## Create empty level matrix
  var_level <- matrix(0,nrow=((endyear-startyear+1)*4),ncol=9)
  
  ## Create empty "raw" level matrix that will be used for calculating ratios
  raw_level <- var_level
  
  ## Fill in pre-88 level observations for the "raw" version
  raw_level[(1:(dim(levels)[1])),] <- levels
  
  ## Fill in the later  years with the actual values starting with 1988Q1
  for(i in 1:length(newtabnames)){
    
    tempdate <- as.numeric(substr(newtabnames[i],start=1,stop=4))
    
    if(tempdate<1992) {cols <- cols_88}
    if(tempdate>=1992 & tempdate<=2000) {cols <- cols_92}
    if(i==52) {cols <- cols_00} ## Need to treat 2000Q4 differently
    if(tempdate>2000 & tempdate<=2009) {cols <- cols_00}
    if(i==88) {cols<-cols_09}
    if(tempdate>2009) {cols <- cols_09}
    
    tabname <- newtabnames[i]
    
    tempdata <- read_xlsx("QFR_raw_data.xlsx",sheet=tabname,col_names=FALSE,skip=3,
                          .name_repair = "unique_quiet")
    rowdata <- tempdata[(as.numeric(var_rows[5])-3),LettersToNumbers(cols)]
    rowdata <- matrix(unlist(rowdata),nrow=1)
    
    # For all
    var_level[(88+i),1:2] <- rowdata[1,1:2]
    var_level[(88+i),3] <- rowdata[1,1] - rowdata[1,2]
    
    # For durables
    var_level[(88+i),4:5] <- rowdata[1,3:4]
    var_level[(88+i),6] <- rowdata[1,3] - rowdata[1,4]
    
    # For nondurables
    var_level[(88+i),7:8] <- rowdata[1,5:6]
    var_level[(88+i),9] <- rowdata[1,5] - rowdata[1,6]
    
  }
  
  ##### Need to manually do SIC/NAICS conversion for growth rates from 2000Q4-2001Q3
  tempdata_all <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet="2001Q3_sic",col_names=FALSE,
                                   range=paste0(cols_81[1,1],var_rows[6],":",cols_81[1,2],var_rows[6]),
                                   .name_repair = "unique_quiet"))
  
  tempdata_big <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet="2001Q3_sic",col_names=FALSE,
                                   range=paste0(cols_81[2,1],var_rows[7],":",cols_81[2,2],var_rows[7]),
                                   .name_repair = "unique_quiet"))
  
  tempdata_small <- tempdata_all - tempdata_big
  
  # Durables
  tempdata_all_dur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet="2001Q3_sic",col_names=FALSE,
                                       range=paste0(cols_81[3,1],var_rows[6],":",cols_81[3,2],var_rows[6]),
                                       .name_repair = "unique_quiet"))
  
  tempdata_big_dur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet="2001Q3_sic",col_names=FALSE,
                                       range=paste0(cols_81[4,1],var_rows[7],":",cols_81[4,2],var_rows[7]),
                                       .name_repair = "unique_quiet"))
  
  tempdata_small_dur <- tempdata_all_dur - tempdata_big_dur
  
  # Nondurables
  tempdata_all_nondur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet="2001Q3_sic",col_names=FALSE,
                                          range=paste0(cols_81[5,1],var_rows[6],":",cols_81[5,2],var_rows[6]),
                                          .name_repair = "unique_quiet"))
  
  tempdata_big_nondur <- unlist(read_xlsx("QFR_raw_data.xlsx",sheet="2001Q3_sic",col_names=FALSE,
                                          range=paste0(cols_81[6,1],var_rows[7],":",cols_81[6,2],var_rows[7]),
                                          .name_repair = "unique_quiet"))
  
  tempdata_small_nondur <- tempdata_all_nondur - tempdata_big_nondur
  
  tempgrowth_01 <- cbind(tempdata_all,tempdata_big,tempdata_small,
                         tempdata_all_dur,tempdata_big_dur,tempdata_small_dur,
                         tempdata_all_nondur,tempdata_big_nondur,tempdata_small_nondur)
  
  tempgrowth_01 <- unname(tempgrowth_01) 
  
  growth_01 <- apply(tempgrowth_01,FUN=function(x) x/dplyr::lag(x,1),MARGIN=2)
  growth_01 <- unname(growth_01[-1,])
  
  
  
  ## Calculate growth from 1988Q2-2000Q3 (50 observations)
  middleblock <- var_level[89:139,]
  growth_88_01 <- apply(middleblock,FUN=function(x) x/dplyr::lag(x,1),MARGIN=2)[-1,]
  
  ## Combine with previous rates to give growth rates from 1966Q2-2001Q3
  newgrowth <- rbind(growth,growth_88_01,growth_01)
  
  ## Fill in levels for raw data
  raw_level[89:139,] <- middleblock
  raw_level[140:(dim(raw_level)[1]),] <- var_level[140:(dim(raw_level)[1]),]
  
  ## Combine the earlier growth rates to the later levels
  for(j in (dim(newgrowth)[1]):1){
    var_level[j,] <- var_level[(j+1),]/newgrowth[j,]
  }
  
  

  ## Specify column names; notation is "variable_size_type"
  ## Sizes: Total (_t), Large (_l), Small (_s)
  ## Types: All (_a), Durable (_d), Nondurable
  name_roots <- paste0(qfrvar,
                       c("_t_a","_l_a","_s_a","_t_d","_l_d","_s_d","_t_n","_l_n","_s_n"))
  
  colnames(var_level) <- paste0("nom_",name_roots)
  colnames(raw_level) <- paste0("nom_",name_roots)
  
  rvar_level <- var_level
  
  ## Specify price indices. By default, I use the BEA's nonresidential investment price index
  ##  for NPPE, and the GDP price index for all other variables
  if(qfrvar %in% c("nppe")){
    pricedata <- macrodata[,"Nonres_fixed_invest_price"]
  } else {pricedata <- macrodata[,"gdppi"]}
  
  
  for( i in 1:(dim(var_level)[2]) ){
    rvar_level[,i] <- var_level[,i]*100/pricedata
  } 
  
  
  colnames(rvar_level) <- paste0("real_",name_roots)
  
  ## Create individual data objects for each variable
  eval(parse(text=paste0(qfrvar,"_data <- var_level")))
  eval(parse(text=paste0(qfrvar,"_data_real <- rvar_level")))
  eval(parse(text=paste0(qfrvar,"_data_raw <- raw_level")))
  
  
  ## Combine all of the datasets into one matrix
  nom_data <- cbind(nom_data,var_level)
  real_data <- cbind(real_data,rvar_level)
  raw_data <- cbind(raw_data,raw_level)
  
  
}



##### Step 2: Create ratios #####

## Regressions only use durable, nondurable, and total manufacturing series
sectors <- c("_t_d","_t_n","_t_a")

## Short-term liability share (current liabilities/total liabilities)
cur_liab_ratios <- raw_data[,paste0("nom_cur_liab",sectors)]/raw_data[,paste0("nom_liab",sectors)]
colnames(cur_liab_ratios) <- paste0("cur_liab_ratio",sectors)

## Cashflow (net income/capital stock)
cashflow_ratios <- raw_data[,paste0("nom_net_income",sectors)]/raw_data[,paste0("nom_nppe",sectors)]
colnames(cashflow_ratios) <- paste0("cashflow_ratio",sectors)

## Dividend payout ratios (dividends/equity)
div_ratios <- raw_data[,paste0("nom_dividend",sectors)]/raw_data[,paste0("nom_equity",sectors)]
colnames(div_ratios) <- paste0("div_ratio",sectors)


##### Step 3: Export data for regressions #####

## Extract real NPPE and sales data 
regdata <- real_data[,c(paste0("real_sales",sectors),paste0("real_nppe",sectors))]

## Combine with ratios
qfr_cleaned_data <- cbind(regdata,cur_liab_ratios,cashflow_ratios,div_ratios)

## Save output to be used for regressions and plotting
write.csv(qfr_cleaned_data,"qfr_cleaned_data.csv")


